{% extends "base.html" %} {% block page_content %}
This document describes an estimate of the positive predictive value (PPV) of variant discovery for all variants in the gene KCNH2 on long QT syndrome type 2 (LQT2). Additional details on the methods used are published Kroncke et al. 2020 PLOS Genetics and at the following website: (https://oates.app.vumc.org/vancart/SCN5A/SCN5A-report.html). We use observed and estimated probability of LQT2 diagnosis for all known KCNH2 variants as a way to assess the per variant PPV for variant discovery. Our objective is to develope a prior estimate of the per variant PPV on LQT2 which incorporates structure, function, and in silico predictors. We use these in silico and in vitro data to generate a Bayesian prior estimate of the per variant PPV since these data can be generated in a lab setting, unlike heterozygotes/carriers of KCNH2 variants which may or may not exist. The final posterior estimate combines this derived prior and clinically phenotyped heterozygotes/carriers.
# mut_type has includes type and isoform for all variants in the literature and in the cohort.
mut_type <- mut_type[mut_type$mut_type == "missense",]
# Cohort carrier/heterozygote counts and variant IDs.
# Here we select only the missense variants (in-frame indels are also included as "missense")
cohort.data <- cohort.data[cohort.data$mut_type == "missense" & cohort.data$total_carriers > 0,]
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]
# Here, heterozygotes/carriers from gnomAD are removed to test their influence on performance
# Remove comments in next two lines and complete subsequent evaluation to assess influence of gnomAD on
# calculations.
#d$total_carriers <- d$total_carriers - d$gnomAD # test influence of gnomAD
#d$unaff <- d$total_carriers - d$lqt2 # test influence of gnomAD
# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" when evaluating ROC-AUC of n=1 variants from the literatureUse observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot probability density versus residue
# Mean squared error
mse <- function(sm) {
mean((sm$residuals)^2*(sm$weights))
}
# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.2329 -0.1653 -0.0232 0.0763 0.7561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2332 0.0135 17.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2323 on 789 degrees of freedom
## (467 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model)#p*(1-p)
# Derive alpha and beta from weighted mean and MSE (estimated variance)
estBetaParams <- function(mu, var) {
alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
beta <- alpha * (1 / mu - 1)
return(params = list(alpha = alpha, beta = beta))
}
# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))## [1] "alpha0 = 0.54041284982238 beta0 = 1.77720039413882"
# Bayesian LQT2 penetrance estimates from empirical priors
# and observed affected/unaffected counts:
d$lqt2_penetranceBayesian_initial <- (alpha0 + d[,"lqt2"])/((alpha0 + beta0 + d[,"total_carriers"]))
d$lqt2_penetranceBayesian<-d$lqt2_penetranceBayesian_initial
# Plot literature observed LQT2 penetrance versus residue number
m<- d %>%
select(resnum, pmean = penetrance_lqt2) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
fit <- loess(d[,"penetrance_lqt2"]~as.numeric(d[,"resnum"]), span = 0.15)
plot(d$resnum, d$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 Penetrance Estimate")
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.
# NOTE: adjust 3rd argument given to funcdist, "d,", to d[d$total_carriers>1,] when
# evaluating ROC of total_carriers == 1
h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
mut_type <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
for(rec in 1:nrow(mut_type)){
if (!is.na(mut_type[rec, "resnum"])) {
l<-length(h2.covariates[h2.covariates$var == mut_type[rec, "var"],"var"])
for (m in 1:l){
h2.covariates[h2.covariates$var == mut_type[rec, "var"],
c("lqt2_dist", "lqt2_dist_weight", "resnum")][m,] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d, h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
}
}
}
# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
m<- h2.covariates %>%
select(resnum, pmean = lqt2_dist) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
fit <- loess(h2.covariates[,"lqt2_dist"]~as.numeric(h2.covariates[,"resnum"]), span = 0.15)
plot(h2.covariates$resnum, h2.covariates$lqt2_dist, xlab ="Residue", ylab = "LQT2 Penetrance Density", xlim=c(0,1160))
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)Evaluate weighted Spearman correlations coefficients between observed LQT2 diagnosis probability in the literature and various potential predictors
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 diagnosis probability for all variants including
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d)
calcPval=function(xName,yName,weightName,nPerms,new.mat2){
# Pulls out variables
x=new.mat2[,xName]
y=new.mat2[,yName]
w=new.mat2[,weightName]
x2=x[!is.na(x)]
y2=y[!is.na(x)]
w2=w[!is.na(x)]
# Calculate the real correlation
realCorr=weightedCorr(x2,y2,method='spearman',weights=w2)
# Do permutations, calculate fake correlations
permutedCorrList=c()
for(permNum in 1:nPerms){
permutedX=sample(x2,length(x2),replace=FALSE)
wCorrSim=weightedCorr(permutedX,y2,method='spearman',weights=w2)
permutedCorrList=c(permutedCorrList,wCorrSim)
}
permutedCorrList2=abs(permutedCorrList)
realCorr2=abs(realCorr)
# Calculate pvalue
summ=sum(realCorr2<permutedCorrList2)
pValue=summ/nPerms
return(list(realCorr,pValue,length(x2)))
}
calcAllPvals=function(yList,xList,nPerms,weightName,new.mat2){
i=0
resultTable=data.frame()
for(yName in yList){
for(xName in xList){
i=i+1
result=calcPval(xName,yName,weightName,nPerms,new.mat2)
resultTable[i,'x']=xName
resultTable[i,'y']=yName
resultTable[i,'nPerms']=nPerms
resultTable[i,'weightedCorr']=result[[1]]
resultTable[i,'pValue']=result[[2]]
resultTable[i,'n']=result[[3]]
#print(resultTable[i,'pValue'])
}
}
print(resultTable)
return(resultTable)
}
yList=c('penetrance_lqt2')
xList=c('hm_ssPeak','hm_tailPeak','hm_vhalfact','hm_vhalfinact','hm_recovfrominact', 'hm_taudeact_fast',
'ht_ssPeak','ht_tailPeak','ht_vhalfact','ht_vhalfinact','ht_recovfrominact', 'ht_taudeact_fast',
'pph2_prob', 'provean_score', "blast_pssm",
'pamscore', 'aasimilaritymat', "lqt2_dist", "revel_score")
tmp<-d[!is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.319565106 0.262 20
## 2 hm_tailPeak penetrance_lqt2 1000 -0.564758635 0.000 100
## 3 hm_vhalfact penetrance_lqt2 1000 0.147612664 0.291 68
## 4 hm_vhalfinact penetrance_lqt2 1000 0.669666858 0.000 29
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.734693865 0.011 12
## 6 hm_taudeact_fast penetrance_lqt2 1000 -0.375296727 0.034 42
## 7 ht_ssPeak penetrance_lqt2 1000 -0.464626144 0.023 34
## 8 ht_tailPeak penetrance_lqt2 1000 -0.616133062 0.000 83
## 9 ht_vhalfact penetrance_lqt2 1000 -0.061932733 0.736 41
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.173738913 0.454 26
## 11 ht_recovfrominact penetrance_lqt2 1000 -0.407537689 0.378 6
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.009703229 0.971 14
## 13 pph2_prob penetrance_lqt2 1000 0.392919452 0.000 741
## 14 provean_score penetrance_lqt2 1000 -0.585053878 0.000 741
## 15 blast_pssm penetrance_lqt2 1000 -0.250452543 0.000 741
## 16 pamscore penetrance_lqt2 1000 -0.116758403 0.026 750
## 17 aasimilaritymat penetrance_lqt2 1000 0.018401459 0.703 750
## 18 lqt2_dist penetrance_lqt2 1000 0.583417065 0.000 748
## 19 revel_score penetrance_lqt2 1000 0.664409462 0.000 750
tmp<-d[!is.na(d$ht_tailPeak) & !is.na(d$penetrance_lqt2) & !is.na(d$revel_score),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.690504615 0.088 10
## 2 hm_tailPeak penetrance_lqt2 1000 -0.428867090 0.004 66
## 3 hm_vhalfact penetrance_lqt2 1000 -0.093938286 0.627 35
## 4 hm_vhalfinact penetrance_lqt2 1000 0.629956576 0.048 15
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.772690799 0.286 4
## 6 hm_taudeact_fast penetrance_lqt2 1000 -0.228864954 0.405 17
## 7 ht_ssPeak penetrance_lqt2 1000 -0.464626144 0.012 34
## 8 ht_tailPeak penetrance_lqt2 1000 -0.616133062 0.000 83
## 9 ht_vhalfact penetrance_lqt2 1000 -0.061096836 0.744 40
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.173348992 0.443 25
## 11 ht_recovfrominact penetrance_lqt2 1000 -0.407537689 0.402 6
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.009703229 0.970 14
## 13 pph2_prob penetrance_lqt2 1000 0.332107233 0.006 81
## 14 provean_score penetrance_lqt2 1000 -0.455185289 0.000 81
## 15 blast_pssm penetrance_lqt2 1000 0.055910704 0.687 81
## 16 pamscore penetrance_lqt2 1000 0.058560727 0.628 83
## 17 aasimilaritymat penetrance_lqt2 1000 0.063049344 0.628 83
## 18 lqt2_dist penetrance_lqt2 1000 0.755839461 0.000 83
## 19 revel_score penetrance_lqt2 1000 0.612183017 0.000 83
Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.nonoverlap.data[lit.nonoverlap.data$mut_type == "missense",]
# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 2,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literature
# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" when evaluating ROC-AUC of n=1 variants from the literatureUse observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot LQTS probability density versus residue
# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.2329 -0.1653 -0.0232 0.0763 0.7561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2332 0.0135 17.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2323 on 789 degrees of freedom
## (467 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model) #p*(1-p)
# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))## [1] "alpha0 = 0.54041284982238 beta0 = 1.77720039413882"
With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication.
h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
mut_type <- mut_type[mut_type$mut_type == "missense" & mut_type$isoform == "A",]
for(rec in 1:nrow(mut_type)){
if (!is.na(mut_type[rec, "resnum"])) {
h2.covariates[h2.covariates$var == mut_type[rec, "var"],
c("lqt2_dist", "lqt2_dist_weight", "resnum")] <- c(funcdist(mut_type[rec, "resnum"], mut_type[rec, "var"], d[d$total_carriers>1,], h2dist, "penetrance_lqt2", "sigmoid", 7), mut_type[rec, "resnum"])
}
}
# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 penetrance for all variants including
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d) Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
# Literature dataset where potentially overlapping carriers/heterozygotes are removed
d <- lit.cohort.data[lit.cohort.data$mut_type == "missense",]
# add all possible variants
allvariants<-data.frame(unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"var"]), stringsAsFactors = F)
names(allvariants)<-"var"
allvariants$isoform<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"isoform"])
allvariants$mut_type<-unlist(h2.covariates[!is.na(h2.covariates$cardiacboost),"mut_type"])
a<-merge(d,allvariants,all = T)
a[is.na(a$total_carriers),"total_carriers"] <- 0
a[is.na(a$lqt2),"lqt2"] <- 0
d<-a
# set initial weighting and penetrance
d$weight = 1-1/(0.01+d$total_carriers)
d$penetrance_lqt2 <- d$lqt2/d$total_carriers
d[d$total_carriers < 1,"weight"] <- 0.000 # This is changed to "< 2" here to evaluate ROC-AUC of n=1 variants from the literatureUse observed LQT2 diagnosis probability to calculate “LQTS probability density” as described in previous publication. Plot diagnosis probability density versus residue
# Mean squared error
mse <- function(sm) {
mean((sm$residuals)^2*(sm$weights))
}
# Derive alpha and beta from weighted mean and MSE (estimated variance)
estBetaParams <- function(mu, var) {
alpha <- ((1 - mu) / var - 1 / mu) * mu ^ 2
beta <- alpha * (1 / mu - 1)
return(params = list(alpha = alpha, beta = beta))
}
# Weighted mean to determine LQT2 penetrance empirical prior
newdata = data.frame(wt=1)
model <- lm(penetrance_lqt2 ~ 1, data=d, weights = d$weight)
summary(model)##
## Call:
## lm(formula = penetrance_lqt2 ~ 1, data = d, weights = d$weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.3424 -0.2429 -0.0341 0.0654 0.6315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.34273 0.01253 27.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2541 on 1017 degrees of freedom
## (6240 observations deleted due to missingness)
p<-predict(model, newdata)
dev<- mse(model) #p*(1-p)
# Estimated shape parameters for LQT2 empirical prior
alpha0 = estBetaParams(p,dev)$alpha
beta0 = estBetaParams(p,dev)$beta
print(paste("alpha0 = ", alpha0, " beta0 = ", beta0))## [1] "alpha0 = 0.854290338628253 beta0 = 1.63833103141397"
With the updated empirical priors applied to carrier counts, calculate “LQTS probability density” as described in previous publication. !!! NOTE: since these data are truly the “best estimates” we include all variants in the calculation such that unique scores are by residue not by variant.
h2.covariates[, "lqt2_dist"]<-NA
h2.covariates[, "lqt2_dist_weight"]<-NA
ld<-0
for(rec in seq(2,1159,1)){
#print(rec)
ld <- funcdist(rec, "var", d[!is.na(d$total_carriers) & d$total_carriers>0 & d$isoform == "A" & d$mut_type != "nonsense",], h2dist, "penetrance_lqt2", "sigmoid", 7)
h2.covariates[!is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var) & !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist"] <- ld[1]
h2.covariates[!is.na(h2.covariates$isoform) & !is.na(h2.covariates$resnum) & h2.covariates$isoform=="A" & h2.covariates$resnum == rec & !is.na(h2.covariates$var)& !is.na(h2.covariates$mut_type) & h2.covariates$mut_type == "missense","lqt2_dist_weight"] <-ld[2]
}
# Plot lqt2_dist versus residue number
h2.covariates$resnum<-as.integer(h2.covariates$resnum)
h2.covariates <- h2.covariates[order(h2.covariates$resnum),]
h2.covariates <- h2.covariates[!is.na(h2.covariates$resnum),]
# Merge "d" with full variant list and set carrier counts to 0.
# This is done for convenience so we can estimate LQT2 penetrance for all variants including
# those witheld during model construction. This will make validation easier.
d <- merge(d, h2.covariates, all = TRUE)
d <- unique(d)
# annotate structural location (hotspot)
d$Structure<-NA
d[!is.na(d$lqt2_dist) & d$lqt2_dist<0.1,"Structure"]<-"Non_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.1 & d$lqt2_dist<0.4,"Structure"]<-"Mild_Hotspot"
d[!is.na(d$lqt2_dist) & d$lqt2_dist>=0.4,"Structure"]<-"Hotspot"
# annotate functional perturbation
d$Function<-NA
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.25,"Function"]<-"Severe Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak<0.5 & d$ht_tailPeak>=0.25,"Function"]<-"Dominant LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.5 & d$ht_tailPeak<0.75,"Function"]<-"LOF"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=0.75 & d$ht_tailPeak<1.25,"Function"]<-"Normal"
d[!is.na(d$ht_tailPeak) & d$ht_tailPeak>=1.25,"Function"]<-"GOF"Use an EM algorithm to estimate LQT2 PPV for each variant (or LQT2 diagnosis probability)
load("prepared_data.RData")
load("lit_all_data_checkpoint.RData")
cohort.data$weight = 1-1/(0.01+cohort.data$total_carriers)
cohort.data$penetrance_lqt2 <- cohort.data$lqt2/cohort.data$total_carriers
for (variant in cohort.data$var) {
if (!is.na(match(variant, d$var))) {
cohort.data[cohort.data$var == variant, c("p_mean_w","alpha", "beta", "lqt2_lit", "unaff_lit", "total_carriers_lit", "lqt2_dist")] <- d[match(variant, d$var), c("p_mean_w", "alpha", "beta", "lqt2", "unaff", "total_carriers", "lqt2_dist")]
}
}
m<- cohort.data %>%
select(resnum, pmean = penetrance_lqt2) %>%
mutate(p_mean_smooth = rollmean(pmean, k=20, fill = NA))
fit <- loess(cohort.data[,"penetrance_lqt2"]~as.numeric(cohort.data[,"resnum"]), span = 0.15)
plot(cohort.data$resnum, cohort.data$penetrance_lqt2, xlab ="Residue", ylab = "LQT2 diagnosis probability", col = "red", pch=19)
xrange <- seq(min(fit$x), max(fit$x), length.out = 100)
ps <- predict(fit, xrange, se=T)
lines(xrange, ps$fit*1, lwd=5)
lines(xrange, (ps$fit+1.96*ps$se.fit)*1, lty=2, lwd=4)
lines(xrange, (ps$fit-1.96*ps$se.fit)*1, lty=2, lwd=4)Use the observed diagnosis probability from as the TRUE diagnosis probability, generate n binomial observations
Use the final EM algorithm posterior as the prior for Beta-Binomial, incorporate data from step (1), generate the posterior distribution, and get 95% credible interval.
Check whether the interval cover the true diagnosis probability from Step 1.
Repeat Step 1 to Step 3 N times to get the coverage rate.
BootsCoverage <- function(var,n=100,N=1000,true){
# var: variant name
# n: number of subjects in the new data
# N: number of Bootstrap
# extract the "true" diagnosis probability
true.p <- d[d$var==var,true]
# generate binomial data
event <- rbinom(N,n,true.p)
# get the posterior credible interval
alpha <- d$alpha[which(d$var==var)]
beta <- d$beta[which(d$var==var)]
new.alpha <- alpha + event
new.beta <- beta + n - event
lb <- qbeta(0.025,new.alpha,new.beta)
ub <- qbeta(0.975,new.alpha,new.beta)
# change lb to floor of nearest 0.1
lb <- floor(lb*20)/20
ub <- ceiling(ub*20)/20
return(sum(lb < true.p & ub > true.p)/N)
}The coverage plot where observed diagnosis probability is the “true” diagnosis probability and one hundred new observations are added is shown below.
# load data, literature + gnomAD + cohort
load("lit_all_data_checkpoint.RData")
load("cohort_checkpoint.RData")
calcPval=function(xName,yName,weightName,nPerms,new.mat2){
# Pulls out variables
x=new.mat2[,xName]
y=new.mat2[,yName]
w=new.mat2[,weightName]
x2=x[!is.na(x)]
y2=y[!is.na(x)]
w2=w[!is.na(x)]
# Calculate the real correlation
realCorr=weightedCorr(x2,y2,method='spearman',weights=w2)
# Do permutations, calculate fake correlations
permutedCorrList=c()
for(permNum in 1:nPerms){
permutedX=sample(x2,length(x2),replace=FALSE)
wCorrSim=weightedCorr(permutedX,y2,method='spearman',weights=w2)
permutedCorrList=c(permutedCorrList,wCorrSim)
}
permutedCorrList2=abs(permutedCorrList)
realCorr2=abs(realCorr)
# Calculate pvalue
summ=sum(realCorr2<permutedCorrList2)
pValue=summ/nPerms
return(list(realCorr,pValue,length(x2)))
}
calcAllPvals=function(yList,xList,nPerms,weightName,new.mat2){
i=0
resultTable=data.frame()
for(yName in yList){
for(xName in xList){
i=i+1
result=calcPval(xName,yName,weightName,nPerms,new.mat2)
resultTable[i,'x']=xName
resultTable[i,'y']=yName
resultTable[i,'nPerms']=nPerms
resultTable[i,'weightedCorr']=result[[1]]
resultTable[i,'pValue']=result[[2]]
resultTable[i,'n']=result[[3]]
#print(resultTable[i,'pValue'])
}
}
print(resultTable)
return(resultTable)
}
# Select covariates from isoform "A" in cohort dataset
cohort.data <- cohort.data[!is.na(cohort.data$total_carriers) & cohort.data$isoform == "A" & cohort.data$mut_type == "missense",]
tmp<-cohort.data[!is.na(cohort.data$provean_score) & !is.na(cohort.data$revel_score) & !is.na(cohort.data$pamscore),]
yList=c('penetrance_lqt2')
xList=c('hm_ssPeak','hm_tailPeak','hm_vhalfact','hm_vhalfinact','hm_recovfrominact', 'hm_taudeact_fast',
'ht_ssPeak','ht_tailPeak','ht_vhalfact','ht_vhalfinact','ht_recovfrominact', 'ht_taudeact_fast',
'pph2_prob', 'provean_score', "blast_pssm",
'pamscore', 'aasimilaritymat', "lqt2_dist", "revel_score", 'p_mean_w')
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 -0.13782555 0.797 8
## 2 hm_tailPeak penetrance_lqt2 1000 -0.45322142 0.007 46
## 3 hm_vhalfact penetrance_lqt2 1000 0.24454237 0.235 35
## 4 hm_vhalfinact penetrance_lqt2 1000 0.58824513 0.109 13
## 5 hm_recovfrominact penetrance_lqt2 1000 -0.01793348 0.973 8
## 6 hm_taudeact_fast penetrance_lqt2 1000 0.09551958 0.726 26
## 7 ht_ssPeak penetrance_lqt2 1000 -0.59432412 0.074 11
## 8 ht_tailPeak penetrance_lqt2 1000 -0.72598217 0.000 39
## 9 ht_vhalfact penetrance_lqt2 1000 0.04702470 0.833 22
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.51551414 0.076 14
## 11 ht_recovfrominact penetrance_lqt2 1000 0.31291231 0.531 4
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.40856426 0.188 13
## 13 pph2_prob penetrance_lqt2 1000 0.29038377 0.000 268
## 14 provean_score penetrance_lqt2 1000 -0.38127676 0.000 268
## 15 blast_pssm penetrance_lqt2 1000 -0.07798665 0.272 268
## 16 pamscore penetrance_lqt2 1000 -0.05859981 0.420 268
## 17 aasimilaritymat penetrance_lqt2 1000 0.12912560 0.086 268
## 18 lqt2_dist penetrance_lqt2 1000 0.46761571 0.000 268
## 19 revel_score penetrance_lqt2 1000 0.45089391 0.000 268
## 20 p_mean_w penetrance_lqt2 1000 0.51861171 0.000 268
rm(tmp)
rm(t)
i=0
tmp<-data.frame()
for (x in xList){
i=i+2
tmp[i-1,"Feature"]<-x
t<-d[!is.na(d[,x]) & d$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
tmp[i,"Feature"]<-paste(x,"_cohort")
t<-d[!is.na(d[,x]) & d$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
foo <- boot(t, function(data,indices)
weightedCorr(t[indices,x],t$penetrance_lqt2[indices], method="spearman", weights = t$weight[indices]), R=1000)
tmp[i-1,"Spearman"]<-foo$t0
tmp[i-1,"Spearman_low"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[1][[1]]
tmp[i-1,"Spearman_high"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[2][[1]]
tmp[i-1,"n"]<-length(t[,x])
t<-cohort.data[!is.na(cohort.data[,x]) & cohort.data$total_carriers>0,]
t<-t[!is.na(t[,"var"]),]
foo <- boot(t, function(data,indices)
weightedCorr(t[indices,x],t$penetrance_lqt2[indices], method="spearman", weights = t$weight[indices]), R=1000)
tmp[i,"Spearman"]<-foo$t0
tmp[i,"Spearman_low"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[1][[1]]
tmp[i,"Spearman_high"]<-quantile(foo$t,c(0.025,0.975), na.rm = T)[2][[1]]
tmp[i,"n"]<-length(t[,x])
}
forestplot(tmp$Feature,tmp$Spearman,tmp$Spearman_low,tmp$Spearman_high)# reset "tmp" variable
tmp<-cohort.data[!is.na(cohort.data$provean_score) & !is.na(cohort.data$revel_score) & !is.na(cohort.data$pamscore),]
# Weighted R2 between observed LQT2 penetrance and post-test probability
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability"
## [1] 0.3102379
## 2.5% 97.5%
## 0.1949797 0.4221402
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 diagnosis probability density versus observed cohort LQT2 diagnosis probability")## [1] "LQT2 diagnosis probability density versus observed cohort LQT2 diagnosis probability"
## [1] 0.2386498
## 2.5% 97.5%
## 0.1379722 0.3462039
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL score versus observed cohort LQT2 diagnosis probability")## [1] "REVEL score versus observed cohort LQT2 diagnosis probability"
## [1] 0.2200169
## 2.5% 97.5%
## 0.1273068 0.3306973
# Evaluate only variants with Heterozygous peak tail current measured.
tmp<-cohort.data[!is.na(cohort.data$ht_tailPeak),]
resultTable<-calcAllPvals(yList, xList, 1000, 'weight', tmp)## x y nPerms weightedCorr pValue n
## 1 hm_ssPeak penetrance_lqt2 1000 0.97286492 0.304 4
## 2 hm_tailPeak penetrance_lqt2 1000 -0.25115090 0.257 28
## 3 hm_vhalfact penetrance_lqt2 1000 0.10800601 0.699 15
## 4 hm_vhalfinact penetrance_lqt2 1000 -0.58787950 0.307 6
## 5 hm_recovfrominact penetrance_lqt2 1000 -1.00000000 0.000 2
## 6 hm_taudeact_fast penetrance_lqt2 1000 0.15426767 0.809 6
## 7 ht_ssPeak penetrance_lqt2 1000 -0.66768740 0.044 12
## 8 ht_tailPeak penetrance_lqt2 1000 -0.73814391 0.000 40
## 9 ht_vhalfact penetrance_lqt2 1000 0.22559402 0.352 21
## 10 ht_vhalfinact penetrance_lqt2 1000 -0.57071390 0.042 15
## 11 ht_recovfrominact penetrance_lqt2 1000 0.31291231 0.515 4
## 12 ht_taudeact_fast penetrance_lqt2 1000 0.55832008 0.087 11
## 13 pph2_prob penetrance_lqt2 1000 0.29181628 0.095 39
## 14 provean_score penetrance_lqt2 1000 -0.31546899 0.072 39
## 15 blast_pssm penetrance_lqt2 1000 0.20233110 0.277 39
## 16 pamscore penetrance_lqt2 1000 -0.02844874 0.883 40
## 17 aasimilaritymat penetrance_lqt2 1000 0.28888518 0.087 40
## 18 lqt2_dist penetrance_lqt2 1000 0.67282397 0.000 40
## 19 revel_score penetrance_lqt2 1000 0.72321081 0.000 40
## 20 p_mean_w penetrance_lqt2 1000 0.76322146 0.000 40
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed cohort LQT2 diagnosis probability"
## [1] 0.5669383
## 2.5% 97.5%
## 0.3108545 0.7676876
model <- lm(penetrance_lqt2~ht_tailPeak, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("Heterozygously measured peak tail current versus observed cohort LQT2 diagnosis probability")## [1] "Heterozygously measured peak tail current versus observed cohort LQT2 diagnosis probability"
## [1] 0.4873147
## 2.5% 97.5%
## 0.2911190 0.6737717
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed cohort LQT2 diagnosis probability")## [1] "LQT2 probability density versus observed cohort LQT2 diagnosis probability"
## [1] 0.4168252
## 2.5% 97.5%
## 0.1532764 0.6646178
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL score versus observed cohort LQT2 diagnosis probability")## [1] "REVEL score versus observed cohort LQT2 diagnosis probability"
## [1] 0.4662318
## 2.5% 97.5%
## 0.2608208 0.7180945
load("lit_all_data_checkpoint.RData")
tmp<-d[!is.na(d$provean_score) & !is.na(d$penetrance_lqt2) & !is.na(d$lqt2_dist) & !is.na(d$revel_score),]
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability"
## [1] 0.8168311
## 2.5% 97.5%
## 0.7699019 0.8555075
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed literature LQT2 diagnosis probability")## [1] "LQT2 probability density versus observed literature LQT2 diagnosis probability"
## [1] 0.4902899
## 2.5% 97.5%
## 0.3922374 0.5897895
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL versus observed literature LQT2 diagnosis probability")## [1] "REVEL versus observed literature LQT2 diagnosis probability"
## [1] 0.3920345
## 2.5% 97.5%
## 0.3285295 0.4567138
# Evaluate only variants with Heterozygous peak tail current measured.
tmp<-d[!is.na(d$ht_tailPeak) & !is.na(d$provean_score) & !is.na(d$penetrance_lqt2) & !is.nan(d$penetrance_lqt2) & !is.na(d$lqt2_dist),]
foo <- boot(tmp, function(data,indices)
weightedCorr(tmp$p_mean_w[indices],tmp$penetrance_lqt2[indices], method="pearson", weights = tmp$weight[indices])^2, R=1000)
print("EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability")## [1] "EM estimated LQT2 diagnosis probability versus observed literature LQT2 diagnosis probability"
## [1] 0.8884971
## 2.5% 97.5%
## 0.8080615 0.9436230
model <- lm(penetrance_lqt2~lqt2_dist, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("LQT2 probability density versus observed literature LQT2 diagnosis probability")## [1] "LQT2 probability density versus observed literature LQT2 diagnosis probability"
## [1] 0.5945224
## 2.5% 97.5%
## 0.4239511 0.7534652
model <- lm(penetrance_lqt2~ht_tailPeak, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("Heterozygous peak tail current versus observed literature LQT2 diagnosis probability")## [1] "Heterozygous peak tail current versus observed literature LQT2 diagnosis probability"
## [1] 0.3390998
## 2.5% 97.5%
## 0.1810048 0.5319460
model <- lm(penetrance_lqt2~revel_score, data = tmp, weights = weight)
mod<-data.frame(model$fitted.values,model$model$penetrance_lqt2,model$weights)
foo <- boot(mod, function(data,indices)
weightedCorr(mod$model.fitted.values[indices],mod$model.model.penetrance_lqt2[indices], method="pearson", weights = mod$model.weights[indices])^2, R=1000)
print("REVEL versus observed literature LQT2 diagnosis probability")## [1] "REVEL versus observed literature LQT2 diagnosis probability"
## [1] 0.4286368
## 2.5% 97.5%
## 0.2332227 0.6124817
AUCs from ROCs predicting EM posteriors
## [1] "276 231"
## [1] "276 229"
## [1] "276 226"
## [1] "276 224"
## [1] "276 217"
## [1] "276 205"
## [1] "276 205"
## [1] "276 202"
## [1] "276 202"
## [1] "276 171"
## [1] "276 169"
## [1] "276 164"
## [1] "276 144"
## [1] "276 144"
## [1] "276 136"
## [1] "741 234"
## [1] "741 231"
## [1] "741 229"
## [1] "741 223"
## [1] "741 218"
## [1] "741 211"
## [1] "741 208"
## [1] "741 205"
## [1] "741 205"
## [1] "741 193"
## [1] "741 192"
## [1] "741 190"
## [1] "741 183"
## [1] "741 179"
## [1] "741 174"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
## [1] "362 99"
The code to produce the forest plots is shown in the code chunk.
rm(d)
# Loading combined literature, gnomAD, and cohort dataset
load("lit_all_data_checkpoint.RData")
d<-d[d$total_carriers>0 & d$mut_type!="nonsense" & d$isoform=="A",]
d<-d[!is.na(d$var),]
mean.post <- (d$alpha + d$lqt2)/(d$alpha+d$beta+d$total_carriers)
mean.prior <- (d$alpha)/(d$alpha+d$beta)
lower.prior <- qbeta(0.025,d$alpha,d$beta)
higher.prior <- qbeta(0.975,d$alpha,d$beta)
lower.post <- qbeta(0.025,d$alpha+d$lqt2,d$beta+d$total_carriers-d$lqt2)
higher.post <- qbeta(0.975,d$alpha+d$lqt2,d$beta+d$total_carriers-d$lqt2)
forest.data.post <- data.frame(variant = d$var, mean=mean.post,
lower=lower.post, higher=higher.post, resnum=d$resnum, tc=d$total_carriers, lqt2=d$lqt2)
forest.data.post$group<-"posterior"
forest.data.prior <- data.frame(variant = d$var, mean=mean.prior,
lower=lower.prior, higher=higher.prior, resnum=d$resnum, tc=d$total_carriers, lqt2=d$lqt2)
forest.data.prior$group<-"prior"
forest.data<-rbind(forest.data.post, forest.data.prior)
forest.data<-forest.data[order(forest.data$resnum, forest.data$variant),]
forest.data$label<-""
forest.data[forest.data$group=="posterior","label"]<-paste(forest.data[forest.data$group=="posterior","lqt2"], "/", forest.data[forest.data$group=="posterior","tc"])
#define colours for dots and bars
dotCOLS = c("#866D4B","#000000")
barCOLS = c("#FFFFFF","#FFFFFF")
plotg <- function(a,b){
fd<-forest.data[a:b,]
png( paste("Plots/", a, "-",b,"pics.png",sep=""),res=300,height=10,width=10,units="in")
p<-ggplot(fd, aes(x=reorder(variant,-resnum), y=mean, ymin=lower, ymax=higher, col=group, fill=group)) +
geom_text(data=fd, aes(x=reorder(variant,-resnum), label=label)) +
#specify position here
geom_linerange(size=2,position=position_dodge(width = 1)) +
geom_hline(yintercept=1, lty=1) +
geom_hline(yintercept=0, lty=1) +
#specify position here too
geom_point(size=2, shape=21, colour="white", stroke = 0.5,position=position_dodge(width = 1)) +
scale_fill_manual(values=barCOLS)+
scale_color_manual(values=dotCOLS)+
scale_y_continuous(name="LQT2 Diagnosis Probability", limits = c(0, 1)) +
coord_flip() +
theme_minimal()
print(p)
dev.off()
}
sapply(0:30*50+1,function(x) plotg(x,x+49) )The forests plots are pasted below, where gold bars are posteriors, and black bars are EM priors. Variant name is given to the left and the number of LQT2 affected / total heterozygote count is given above the dot indicating the posterior mean.